#!/usr/bin/perl  -w
##
# Copyright (c) 2015-2016 Advanced Micro Devices, Inc. All rights reserved.
#
# Permission is hereby granted, free of charge, to any person obtaining a copy
# of this software and associated documentation files (the "Software"), to deal
# in the Software without restriction, including without limitation the rights
# to use, copy, modify, merge, publish, distribute, sublicense, and/or sell
# copies of the Software, and to permit persons to whom the Software is
# furnished to do so, subject to the following conditions:
#
# The above copyright notice and this permission notice shall be included in
# all copies or substantial portions of the Software.
#
# THE SOFTWARE IS PROVIDED "AS IS", WITHOUT WARRANTY OF ANY KIND, EXPRESS OR
# IMPLIED, INCLUDING BUT NOT LIMITED TO THE WARRANTIES OF MERCHANTABILITY,
# FITNESS FOR A PARTICULAR PURPOSE AND NONINFRINGEMENT.  IN NO EVENT SHALL THE
# AUTHORS OR COPYRIGHT HOLDERS BE LIABLE FOR ANY CLAIM, DAMAGES OR OTHER
# LIABILITY, WHETHER IN AN ACTION OF CONTRACT, TORT OR OTHERWISE, ARISING FROM,
# OUT OF OR IN CONNECTION WITH THE SOFTWARE OR THE USE OR OTHER DEALINGS IN
# THE SOFTWARE.
##
#usage hipify-perl [OPTIONS] INPUT_FILE
use Getopt::Long;

my $warn_whitelist ="";
GetOptions(
      "print-stats" => \$print_stats                   # print the command-line, like a header.
    , "count-conversions" => \$count_conversions       # count conversions.
    , "quiet-warnings" => \$quiet_warnings           # don't print warnings on unknown CUDA functions.
    , "warn-whitelist=s"=> \$warn_whitelist
    , "no-translate-builtins" => \$no_translate_builtins      # don't translate math functions.
    , "no-translate-textures" => \$no_translate_textures
    , "no-output" => \$no_output                      # don't write any translated output to stdout.
    , "inplace" => \$inplace                          # modify input file inplace, replacing input with hipified output, save backup in ".prehip" file.
														# If .prehip file exists, use that as input to hip.
    , "n" => \$n                                      # combination of print_stats + no-output.
);

$print_stats = 1 if $n;
$no_output   = 1 if $n;

# These uses of cuda[A-Z] are commonly used in CUDA code but don't actually map to any CUDA API:
# TODO - use a hash lookup for these.
@warn_whitelist = (
         "cudaError"
        ,"cudaStatus"
        ,"cudaDevice"
        ,"cudaDevice_t"

        ,"cudaIDs"
        ,"cudaGridDim"
        ,"cudaDimGrid"
        ,"cudaDimBlock"
        ,"cudaDeviceId"
        ,"cudaDevices",

        ,"cudaGradOutput",
        ,"cudaInput",
        ,"cudaOutput",
        ,"cudaGradInput",
        ,"cudaIndices",

        ,"cudaColorSpinorField"
        ,"cudaGaugeField"
        ,"cudaMom"
        ,"cudaGauge"
        ,"cudaInGauge"
        ,"cudaGaugeField"
        ,"cudaColorSpinorField"
        ,"cudaSiteLink"
        ,"cudaFatLink"
        ,"cudaStaple"
        ,"cudaCloverField"
        ,"cudaFatLink"
        ,"cudaParam"
    );
#print "WW=@warn_whitelist\n";


# Allow users to add their own functions.
push (@warn_whitelist, split(',',$warn_whitelist));


#---
#Stats tracking code:
@statNames = ("dev", "mem", "kern", 'coord_func', "math_func", "special_func", "stream", "event", "err", "def", "tex", "extern_shared", "other");


#---
#Compute total of all individual counts:
sub totalStats
{
    my %count = %{ shift() };

    my $total = 0;
    foreach $key (keys %count) {
        $total += $count{$key};
    }

    return $total;
};

#---
sub printStats
{
    my $label =     shift();
    my @statNames = @{ shift() };
    my %counts =    %{ shift() };
    my $warnings =  shift();
    my $loc       = shift();

    my $total = totalStats(\%counts);

    printf STDERR "%s %d CUDA->HIP refs( ", $label, $total;

    foreach $stat (@statNames) {
        printf STDERR "%s:%d ", $stat, $counts{$stat};
    }

    printf STDERR ") warn:%d LOC:%d", $warnings, $loc;
}


#---
# Add adder stats to dest.  Used to add stats for current file to a running total for all files:
sub addStats
{
    my $dest_ref  =  shift() ;
    my %adder = %{ shift() };

    foreach $key (keys %adder) {
        $dest_ref->{$key} += $adder{$key};
        #printf ("D{$key} += %d => %d\n", $adder{$key}, $dest{$key});
    }
}


#---
sub clearStats
{
    my $dest_ref  =  shift() ;
    my @statNames = @{ shift() };

    foreach $stat (@statNames) {
        $dest_ref->{$stat}  = 0;
    }
}




# count of transforms in all files:
my %tt;
clearStats(\%tt, \@statNames);



$Twarnings = 0;
$TlineCount = 0;
my %TwarningTags ;
my %Tkernels ;

my $fileCount = @ARGV;
my $fileName = "";

while (@ARGV) {
    $fileName=shift (@ARGV);


    if ($inplace) {
        my $file_prehip = "$fileName" . ".prehip";
		my $infile;
		my $outfile;
		if (-e $file_prehip) {
			$infile  = $file_prehip;
			$outfile = $fileName;
		} else {
			system ("cp $fileName $file_prehip");
			$infile = $file_prehip;
			$outfile = $fileName;
		}

        open(INFILE,"<", $infile) or die "error: could not open $infile";
        open(OUTFILE,">", $outfile) or die "error: could not open $outfile";
        $OUTFILE = OUTFILE;

    } else {
        open(INFILE,"<", $fileName) or die "error: could not open $fileName";
        $OUTFILE = STDOUT;
    }





    # Note : \b is used in perl to indicate the start of a word - typically that is what we want in this case:
    #

    # count of transforms in this file, init to 0 here:
    my %ft;
    clearStats(\%ft, \@statNames);
    my $countIncludes = 0;
    my $countKeywords = 0; # keywords like __global__, __shared__ - not converted by hipify-perl, but counted here.
    my $warnings = 0;
    my $warningsCublas = 0;
    my $warningsCurand = 0;
    my %warningTags;  # hash with counts of particular unknown keywords.

    my $lineCount = 0;

    undef $/; # Read whole file at once, so we can match newlines.
    while (<INFILE>)
    {

        #--------
        # Compiler Defines
        # __CUDACC__ is set by NVCC to indicate it is treating the input file as CUDA code (as opposed to host)
        # Typically we want any code treated as CUDA code to be treated as accelerator code by Kalmar too
        # __HIPCC__ will set KALMARCC
        $ft{'def'} += s/\b__CUDACC__\b/__HIPCC__/g;

        # __CUDA_ARCH is often used to detect when a function or kernel is being compiled for the device.
        # Don't automaticall convert this - likely these will need special attention with HIP_ARCH_HAS_* macros
        #$ft{'def'} += s/\b__CUDA_ARCH__\b/__HIP_ARCH__/g;



        #--------
        #Includes:
        $countIncludes += s/(\s*#\s*include\s+)[<"]cuda_runtime\.h[>"]/$1<hip\/hip_runtime.h>/;
        $countIncludes += s/(\s*#\s*include\s+)[<"]cuda_runtime_api\.h[>"]/$1<hip\/hip_runtime_api.h>/;
        $countIncludes += s/(\s*#\s*include\s+)[<"]cuda_fp16\.h[>"]/$1<hip\/hip_fp16.h>/;


        #--------
        # Error codes and return types:
        $ft{'err'} += s/\bcudaError_t\b/hipError_t/g;
        $ft{'err'} += s/\bcudaError\b/hipError_t/g;
        $ft{'err'} += s/\bcudaSuccess\b/hipSuccess/g;

        $ft{'err'} += s/\bcudaErrorUnknown\b/hipErrorUnknown/g;
        $ft{'err'} += s/\bcudaErrorMemoryAllocation\b/hipErrorMemoryAllocation/g;
        $ft{'err'} += s/\bcudaErrorMemoryFree\b/hipErrorMemoryFree/g;
        $ft{'err'} += s/\bcudaErrorUnknownSymbol\b/hipErrorUnknownSymbol/g;
        $ft{'err'} += s/\bcudaErrorOutOfResources\b/hipErrorOutOfResources/g;
        $ft{'err'} += s/\bcudaErrorInvalidValue\b/hipErrorInvalidValue/g;
        $ft{'err'} += s/\bcudaErrorInvalidResourceHandle\b/hipErrorInvalidResourceHandle/g;
        $ft{'err'} += s/\bcudaErrorInvalidDevice\b/hipErrorInvalidDevice/g;
        $ft{'err'} += s/\bcudaErrorInvalidDevicePointer\b/hipErrorInvalidDevicePointer/g;
        $ft{'err'} += s/\bcudaErrorNoDevice\b/hipErrorNoDevice/g;
        $ft{'err'} += s/\bcudaErrorNotReady\b/hipErrorNotReady/g;
        $ft{'err'} += s/\bcudaErrorUnknown\b/hipErrorUnknown/g;
        $ft{'err'} += s/\bcudaErrorPeerAccessAlreadyEnabled\b/hipErrorPeerAccessAlreadyEnabled/g;

		# error APIs:
        $ft{'err'} += s/\bcudaGetLastError\b/hipGetLastError/g;
        $ft{'err'} += s/\bcudaPeekAtLastError\b/hipPeekAtLastError/g;
        $ft{'err'} += s/\bcudaGetErrorName\b/hipGetErrorName/g;
        $ft{'err'} += s/\bcudaGetErrorString\b/hipGetErrorString/g;



        #--------
        # Memcpy
        $ft{'mem'} += s/\bcudaMemcpy\b/hipMemcpy/g;
        $ft{'mem'} += s/\bcudaMemcpyHostToHost\b/hipMemcpyHostToHost/g;
        $ft{'mem'} += s/\bcudaMemcpyHostToDevice\b/hipMemcpyHostToDevice/g;
        $ft{'mem'} += s/\bcudaMemcpyDeviceToHost\b/hipMemcpyDeviceToHost/g;
        $ft{'mem'} += s/\bcudaMemcpyDeviceToDevice\b/hipMemcpyDeviceToDevice/g;
        $ft{'mem'} += s/\bcudaMemcpyDefault\b/hipMemcpyDefault/g;
        $ft{'mem'} += s/\bcudaMemcpyToSymbol\b/hipMemcpyToSymbol/g;

        $ft{'mem'} += s/\bcudaMemset\b/hipMemset/g;
        $ft{'mem'} += s/\bcudaMemsetAsync\b/hipMemsetAsync/g;

        $ft{'mem'} += s/\bcudaMemcpyAsync\b/hipMemcpyAsync/g;

        $ft{'mem'} += s/\bcudaMemGetInfo\b/hipMemGetInfo/g;

        $ft{'mem'} += s/\bcudaMemcpyKind\b/hipMemcpyKind/g;

        $ft{'mem'} += s/\bcudaPointerAttributes\b/hipPointerAttribute_t/g;
        $ft{'mem'} += s/\bcudaPointerGetAttributes\b/hipPointerGetAttributes/g;

        $ft{'mem'} += s/\bcudaMemcpy2D\b/hipMemcpy2D/g;
        $ft{'mem'} += s/\bcudaMemcpy2DToArray\b/hipMemcpy2DToArray/g;
        $ft{'mem'} += s/\bcudaMemcpyToArray\b/hipMemcpyToArray/g;

        $ft{'mem'} += s/\bcudaGetSymbolAddress\b/hipGetSymbolAddress/g;
        $ft{'mem'} += s/\bcudaGetSymbolSize\b/hipGetSymbolSize/g;

        #--------
        # Memory management:
        $ft{'mem'} += s/\bcudaMalloc\b/hipMalloc/g;
        $ft{'mem'} += s/\bcudaMallocHost\b/hipHostMalloc/g;  # note conversion to standard hipHost* naming convention
        $ft{'mem'} += s/\bcudaFree\b/hipFree/g;
        $ft{'mem'} += s/\bcudaFreeHost\b/hipHostFree/g;    # note conversion to standard hipHost* naming convention
		$ft{'mem'} += s/\bcudaHostAlloc\b/hipHostMalloc/g;
		$ft{'mem'} += s/\bcudaHostGetDevicePointer\b/hipHostGetDevicePointer/g;
		$ft{'mem'} += s/\bcudaHostAllocDefault\b/hipHostMallocDefault/g;
		$ft{'mem'} += s/\bcudaHostAllocPortable\b/hipHostMallocPortable/g;
		$ft{'mem'} += s/\bcudaHostAllocMapped\b/hipHostMallocMapped/g;
		$ft{'mem'} += s/\bcudaHostAllocWriteCombined\b/hipHostMallocWriteCombined/g;
		$ft{'mem'} += s/\bcudaHostRegisterMapped\b/hipHostRegisterMapped/g;
        $ft{'mem'} += s/\bcudaHostRegister\b/hipHostRegister/g;
        $ft{'mem'} += s/\bcudaHostUnregister\b/hipHostUnregister/g;
        $ft{'mem'} += s/\bcudaHostGetDevicePointer\b/hipHostGetDevicePointer/g;

        $ft{'mem'} += s/\bcudaMallocArray\b/hipMallocArray/g;
        $ft{'mem'} += s/\bcudaFreeArray\b/hipFreeArray/g;
        $ft{'mem'} += s/\bcudaMallocPitch\b/hipMallocPitch/g;


        #--------
        # Events
        $ft{'event'} += s/\bcudaEvent_t\b/hipEvent_t/g;
        $ft{'event'} += s/\bcudaEventCreate\b/hipEventCreate/g;
        $ft{'event'} += s/\bcudaEventCreateWithFlags\b/hipEventCreateWithFlags/g;
        $ft{'event'} += s/\bcudaEventDestroy\b/hipEventDestroy/g;
        $ft{'event'} += s/\bcudaEventRecord\b/hipEventRecord/g;
        $ft{'event'} += s/\bcudaEventElapsedTime\b/hipEventElapsedTime/g;
        $ft{'event'} += s/\bcudaEventSynchronize\b/hipEventSynchronize/g;
        $ft{'event'} += s/\bcudaEventDisableTiming\b/hipEventDisableTiming/g;
        $ft{'event'} += s/\bcudaEventQuery\b/hipEventQuery/g;

        #--------
        # Streams
        $ft{'stream'} += s/\bcudaStream_t\b/hipStream_t/g;
        $ft{'stream'} += s/\bcudaStreamCreate\b/hipStreamCreate/g;
        $ft{'stream'} += s/\bcudaStreamCreateWithFlags\b/hipStreamCreateWithFlags/g;
        $ft{'stream'} += s/\bcudaStreamDestroy\b/hipStreamDestroy/g;
        $ft{'stream'} += s/\bcudaStreamWaitEvent\b/hipStreamWaitEvent/g;
        $ft{'stream'} += s/\bcudaStreamSynchronize\b/hipStreamSynchronize/g;
        $ft{'stream'} += s/\bcudaStreamDefault\b/hipStreamDefault/g;
        $ft{'stream'} += s/\bcudaStreamNonBlocking\b/hipStreamNonBlocking/g;


        #--------
        # Other synchronization
        $ft{'dev'} += s/\bcudaDeviceSynchronize\b/hipDeviceSynchronize/g;
        $ft{'dev'} += s/\bcudaThreadSynchronize\b/hipDeviceSynchronize/g;  # translate deprecated cudaThreadSynchronize
        $ft{'dev'} += s/\bcudaDeviceReset\b/hipDeviceReset/g;
        $ft{'dev'} += s/\bcudaThreadExit\b/hipDeviceReset/g;               # translate deprecated cudaThreadExit
        $ft{'dev'} += s/\bcudaSetDevice\b/hipSetDevice/g;
        $ft{'dev'} += s/\bcudaGetDevice\b/hipGetDevice/g;

        #--------
        # Device
        $ft{'dev'} += s/\bcudaDeviceProp\b/hipDeviceProp_t/g;
        $ft{'dev'} += s/\bcudaGetDeviceProperties\b/hipGetDeviceProperties/g;
        $ft{'dev'} += s/\bcudaDeviceGetPCIBusId\b/hipDeviceGetPCIBusId/g;

        # Attribute
        $ft{'err'} += s/\bcudaDevAttrMaxThreadsPerBlock\b/hipDeviceAttributeMaxThreadsPerBlock/g;
        $ft{'err'} += s/\bcudaDevAttrMaxBlockDimX\b/hipDeviceAttributeMaxBlockDimX/g;
        $ft{'err'} += s/\bcudaDevAttrMaxBlockDimY\b/hipDeviceAttributeMaxBlockDimY/g;
        $ft{'err'} += s/\bcudaDevAttrMaxBlockDimZ\b/hipDeviceAttributeMaxBlockDimZ/g;
        $ft{'err'} += s/\bcudaDevAttrMaxGridDimX\b/hipDeviceAttributeMaxGridDimX/g;
        $ft{'err'} += s/\bcudaDevAttrMaxGridDimY\b/hipDeviceAttributeMaxGridDimY/g;
        $ft{'err'} += s/\bcudaDevAttrMaxGridDimZ\b/hipDeviceAttributeMaxGridDimZ/g;
        $ft{'err'} += s/\bcudaDevAttrMaxSharedMemoryPerBlock\b/hipDeviceAttributeMaxSharedMemoryPerBlock/g;
        $ft{'err'} += s/\bcudaDevAttrTotalConstantMemory\b/hipDeviceAttributeTotalConstantMemory/g;
        $ft{'err'} += s/\bcudaDevAttrWarpSize\b/hipDeviceAttributeWarpSize/g;
        $ft{'err'} += s/\bcudaDevAttrMaxRegistersPerBlock\b/hipDeviceAttributeMaxRegistersPerBlock/g;
        $ft{'err'} += s/\bcudaDevAttrClockRate\b/hipDeviceAttributeClockRate/g;
        $ft{'err'} += s/\bcudaDevAttrMultiProcessorCount\b/hipDeviceAttributeMultiprocessorCount/g;
        $ft{'err'} += s/\bcudaDevAttrComputeMode\b/hipDeviceAttributeComputeMode/g;
        $ft{'err'} += s/\bcudaDevAttrL2CacheSize\b/hipDeviceAttributeL2CacheSize/g;
        $ft{'err'} += s/\bcudaDevAttrMaxThreadsPerMultiProcessor\b/hipDeviceAttributeMaxThreadsPerMultiProcessor/g;
        $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMajor\b/hipDeviceAttributeComputeCapabilityMajor/g;
        $ft{'err'} += s/\bcudaDevAttrComputeCapabilityMinor\b/hipDeviceAttributeComputeCapabilityMinor/g;
        $ft{'err'} += s/\bcudaDevAttrConcurrentKernels\b/hipDeviceAttributeConcurrentKernels/g;
        $ft{'err'} += s/\bcudaDevAttrPciBusId\b/hipDeviceAttributePciBusId/g;
        $ft{'err'} += s/\bcudaDevAttrPciDeviceId\b/hipDeviceAttributePciDeviceId/g;
        $ft{'err'} += s/\bcudaDevAttrMaxSharedMemoryPerMultiprocessor\b/hipDeviceAttributeMaxSharedMemoryPerMultiprocessor/g;
        $ft{'err'} += s/\bcudaDevAttrMemoryClockRate\b/hipDeviceAttributeMemoryClockRate/g;
        $ft{'err'} += s/\bcudaDevAttrGlobalMemoryBusWidth\b/hipDeviceAttributeMemoryBusWidth/g;
        $ft{'dev'} += s/\bcudaDeviceAttr\b/hipDeviceAttribute_t/g;
        $ft{'dev'} += s/\bcudaDeviceGetAttribute\b/hipDeviceGetAttribute/g;

        # Cache config
        $ft{'dev'} += s/\bcudaDeviceSetCacheConfig\b/hipDeviceSetCacheConfig/g;
        $ft{'dev'} += s/\bcudaThreadSetCacheConfig\b/hipDeviceSetCacheConfig/g; # translate deprecated
        $ft{'dev'} += s/\bcudaDeviceGetCacheConfig\b/hipDeviceGetCacheConfig/g;
        $ft{'dev'} += s/\bcudaThreadGetCacheConfig\b/hipDeviceGetCacheConfig/g; # translate deprecated
        $ft{'dev'} += s/\bcudaFuncCache\b/hipFuncCache/g;
        $ft{'dev'} += s/\bcudaFuncCachePreferNone\b/hipFuncCachePreferNone/g;
        $ft{'dev'} += s/\bcudaFuncCachePreferShared\b/hipFuncCachePreferShared/g;
        $ft{'dev'} += s/\bcudaFuncCachePreferL1\b/hipFuncCachePreferL1/g;
        $ft{'dev'} += s/\bcudaFuncCachePreferEqual\b/hipFuncCachePreferEqual/g;
        # function
        $ft{'dev'} += s/\bcudaFuncSetCacheConfig\b/hipFuncSetCacheConfig/g;


        $ft{'dev'} += s/\bcudaDriverGetVersion\b/hipDriverGetVersion/g;

        #--------
        # Peer2Peer
        $ft{'dev'} += s/\bcudaDeviceCanAccessPeer\b/hipDeviceCanAccessPeer/g;
        $ft{'dev'} += s/\bcudaDeviceDisablePeerAccess\b/hipDeviceDisablePeerAccess/g;
        $ft{'dev'} += s/\bcudaDeviceEnablePeerAccess\b/hipDeviceEnablePeerAccess/g;
        $ft{'mem'} += s/\bcudaMemcpyPeerAsync\b/hipMemcpyPeerAsync/g;
        $ft{'mem'} += s/\bcudaMemcpyPeer\b/hipMemcpyPeer/g;
        $ft{'mem'} += s/\bcudaIpcOpenMemHandle\b/hipIpcOpenMemHandle/g;
        $ft{'mem'} += s/\bcudaIpcCloseMemHandle\b/hipIpcCloseMemHandle/g;
        $ft{'mem'} += s/\bcudaIpcGetMemHandle\b/hipIpcGetMemHandle/g;
        $ft{'mem'} += s/\bcudaIpcMemHandle_t\b/hipIpcMemHandle_t/g;
        $ft{'mem'} += s/\bcudaIpcMemLazyEnablePeerAccess\b/hipIpcMemLazyEnablePeerAccess/g;


        # Shared mem:
        $ft{'dev'} += s/\bcudaDeviceSetSharedMemConfig\b/hipDeviceSetSharedMemConfig/g;
        $ft{'dev'} += s/\bcudaThreadSetSharedMemConfig\b/hipDeviceSetSharedMemConfig/g; # translate deprecated
        $ft{'dev'} += s/\bcudaDeviceGetSharedMemConfig\b/hipDeviceGetSharedMemConfig/g;
        $ft{'dev'} += s/\bcudaThreadGetSharedMemConfig\b/hipDeviceGetSharedMemConfig/g;  # translate deprecated
        $ft{'dev'} += s/\bcudaSharedMemConfig\b/hipSharedMemConfig/g;
        $ft{'dev'} += s/\bcudaSharedMemBankSizeDefault\b/hipSharedMemBankSizeDefault/g;
        $ft{'dev'} += s/\bcudaSharedMemBankSizeFourByte\b/hipSharedMemBankSizeFourByte/g;
        $ft{'dev'} += s/\bcudaSharedMemBankSizeEightByte\b/hipSharedMemBankSizeEightByte/g;

        $ft{'dev'} += s/\bcudaGetDeviceCount\b/hipGetDeviceCount/g;

        #--------
        # Profiler
        #$aOt += s/\bcudaProfilerInitialize\b/hipProfilerInitialize/g;  // see if these are called anywhere.
        $ft{'other'} += s/\bcudaProfilerStart\b/hipProfilerStart/g;
        $ft{'other'} += s/\bcudaProfilerStop\b/hipProfilerStop/g;



        $countKeywords += m/__global__/;
        $countKeywords += m/__shared__/;

        #--------
        # CUDA extern __shared__ syntax
        # Note these only work if declaration is on a single line.
        {
            # match uses ? for <.*> which will be unitialized if this is not present in launch syntax.
            no warnings qw/uninitialized/;

            my $k = 0;

            # Match extern __shared__ type foo[]; syntax
            # Replace as HIP_DYNAMIC_SHARED() macro
            $k += s/extern\s+([\w\(\)]+)?\s*__shared__\s+([\w:<>\s]+)\s+(\w+)\s*\[\s*\]\s*;/HIP_DYNAMIC_SHARED($1 $2, $3)/g;

            # test patterns for the regular expression above:
            #'extern __shared__ double foo[];'
            #'extern __shared__ unsigned int foo[];'
            #'extern volatile __shared__ double foo[];'
            #'extern volatile __shared__ unsigned int sdata[];'
            #'extern __shared__ volatile unsigned int sdata[];'
            #'extern __shared__ T s[];'
            #'extern __shared__ T::type s[];'
            #'extern __shared__ blah<T>::type s[];'
            #'extern __shared__ typename mapper<Float>::type s_data[];'
            #'extern __attribute__((used)) __shared__ typename mapper<Float>::type s_data[];'

            $ft{'extern_shared'} += $k;
        }

        #--------
        # CUDA Launch Syntax
        # Note these only work if launch is on a single line.

        # Handle the <<numBlocks, blockDim>>> syntax:
        {
            # match uses ? for <.*> which will be unitialized if this is not present in launch syntax.
            no warnings qw/uninitialized/;

            my $k = 0;
            my $kernelName;

            # Handle the <<numBlocks, blockDim, sharedSize, stream>>> syntax:
	    # $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>([\s*\\]*)\(/hipLaunchKernelGGL(($1$2), dim3($3), dim3($4), $5, $6, /g;
	    $k += s/(\w+)\s*(<.*>)?\s*<<</hipLaunchKernelGGL(($1$2), /g;
	    $kernelName = $1 if $k;

            # Handle the <<numBlocks, blockDim, sharedSize>>> syntax:
	    # $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*,\s*(.+)\s*>>>([\s*\\]*)\(/hipLaunchKernelGGL(($1$2), dim3($3), dim3($4), $5, 0, /g;
	    # $kernelName = $1 if $k;

            # Handle the <<numBlocks, blockDim>>> syntax:
	    # $k += s/(\w+)\s*(<.*>)?\s*<<<\s*(.+)\s*,\s*(.+)\s*>>>([\s\\]*)\(/hipLaunchKernelGGL(($1$2), dim3($3), dim3($4), 0, 0, /g;
	    # $kernelName = $1 if $k;

            $ft{'kern'} += $k;
            if ($k) {
                $Tkernels{$kernelName} ++;
            }
	    $k += s/>>>([\s*\\]*)\(\)/\)/g;
	    $k += s/>>>([\s*\\]*)\(/\, /g;
         }



        unless ($no_translate_textures) {
            $ft{'tex'} += s/\bcudaChannelFormatDesc\b/hipChannelFormatDesc/g;
            $ft{'tex'} += s/\bcudaFilterModePoint\b/hipFilterModePoint/g;
            $ft{'tex'} += s/\bcudaReadModeElementType\b/hipReadModeElementType/g;

            $ft{'tex'} += s/\bcudaArray\b/hipArray/g;
            $ft{'tex'} += s/\bcudaCreateChannelDesc\b/hipCreateChannelDesc/g;
            $ft{'tex'} += s/\bcudaBindTexture\b/hipBindTexture/g;
            $ft{'tex'} += s/\bcudaBindTextureToArray\b/hipBindTextureToArray/g;
            $ft{'tex'} += s/\bcudaUnbindTexture\b/hipUnbindTexture/g;
            $ft{'tex'} += s/\bcudaChannelFormatKindFloat\b/hipChannelFormatKindFloat/g;
            $ft{'tex'} += s/\bcudaAddressMode/hipAddressMode/g;
            $ft{'tex'} += s/\bcudaFilterMode/hipFilterMode/g;
        }


        if ($count_conversions) {
            while (/(\bhip[A-Z]\w+\b)/g) {
                $convertedTags{$1}++;

                #print STDERR "HIP: $1 : ", $translateTags{$1}, "\n";
            }
        }


        # guess that we are in device code , or at least in a file that calls device code.
        # will almost certainly call one of the coordiante functions - could be fooled by clever macros but usually works:
        my $hasDeviceCode = $countKeywords + $ft{'coord_func'} + $ft{'math_func'} + $ft{'special_func'};



        unless ($quiet_warnings) {
            #print STDERR "Check WARNINGs\n";
            # copy into array of lines, process line-by-line to show warnings:
            if ($hasDeviceCode or (/\bcuda/) or (/<<<.*>>>/) or (/(\bcublas[A-Z]\w+)/) or (/(\bcurand[A-Z]\w+)/) ) {
                my @lines = split /\n/, $_;
                my $tmp = $_;  # copies the whole file, could be a little smarter here...
                my $line_num = 0;

                foreach (@lines) {

                    $line_num ++;

                    # remove any whitelisted words:
                    foreach $w (@warn_whitelist) {
                        s/\b$w\b/ZAP/
                    }

					my $tag ;
                    if ((/(\bcuda[A-Z]\w+)/) or (/<<<.*>>>/)) {
                        # flag any remaining code that look like cuda API calls, may want to add these to hipify
                        $tag = (defined $1) ? $1 : "Launch";
					} elsif (/(\bcublas[A-Z]\w+)/) {
						# $warningsCublas++;
						# $tag = $1;
					} elsif (/(\bcurand[A-Z]\w+)/) {
						# $warningsCurand++;
						# $tag = $1;
					}

					if (defined $tag) {
                        $warnings++;
                        $warningTags{$tag}++;
                        print STDERR "  warning: $fileName:#$line_num : $_";
                        print STDERR "\n";
                    }

                    $s = warnUnsupportedSpecialFunctions($line_num);
                    $warnings += $s;


                }

                $_ = $tmp;
            }
        }




        #--------
        # Math libraries
        # To limit bogus translations, try to make sure we are in a kernel (ft{'builtin'} != 0):
        if (not $no_translate_builtins and ($hasDeviceCode > 0)) {
            $ft{'special_func'} += countSupportedSpecialFunctions();
        }


        print "#include <cuda.h>\n";

        #--------
        # Print it!
        # TODO - would like to move this code outside loop but it uses $_ which contains the whole file.
        unless ($no_output) {
            my $apiCalls   = $ft{'err'} + $ft{'event'} + $ft{'mem'} + $ft{'stream'} + $ft{'dev'} + $ft{'def'} +  $ft{'tex'} +  $ft{'other'} + $ft{'math_func'};
            my $kernStuff  = $hasDeviceCode + $ft{'kern'};
            my $totalCalls = $apiCalls + $kernStuff;

            $is_dos = m/\r\n$/;

            if ($totalCalls and ($countIncludes == 0) and ($kernStuff != 0)) {
                # If this file makes kernel builtin calls, and does not include the cuda_runtime.h,
                # then add an #include to match "magic" includes provided by NVCC.
				# This logic can miss cases where cuda_runtime.h is included by another include file.
                print $OUTFILE '#include "hip/hip_runtime.h"' . ($is_dos ? "\r\n" : "\n");
            }
            print $OUTFILE  "$_";
        }

        $lineCount = $_ =~ tr/\n//;

    }

    my $totalConverted = totalStats(\%ft);

    #printf "TOTAL-CONV=%d\n", $totalConverted;


    if (($totalConverted+$warnings) and $print_stats) {
        printStats("info: converted", \@statNames, \%ft, $warnings, $lineCount);
        print STDERR " in '$fileName'\n";
    }


    # Update totals for all files:
    addStats(\%tt, \%ft);
    $Twarnings += $warnings;
    $TlineCount += $lineCount;
    foreach $key (keys %warningTags) {
        $TwarningTags{$key} += $warningTags{$key};
    }
}

#-- Print total stats for all files processed:
if ($print_stats and ($fileCount > 1)) {
    print STDERR "\n";
    printStats("info: TOTAL-converted", \@statNames, \%tt, $Twarnings, $TlineCount);
    print STDERR "\n";

    foreach my $key (sort { $TwarningTags{$b} <=> $TwarningTags{$a} } keys %TwarningTags) {
        printf STDERR "  warning: unconverted %s : %d\n", $key, $TwarningTags{$key};
    }

    my $kernelCnt = keys %Tkernels;
    printf STDERR "  kernels (%d total) : ", $kernelCnt;
    foreach my $key (sort { $Tkernels{$b} <=> $Tkernels{$a} } keys %Tkernels) {
        printf STDERR "  %s(%d)", $key, $Tkernels{$key};
    }
    print STDERR "\n";


    print STDERR "\n";
}

if ($count_conversions) {
    foreach my $key (sort { $convertedTags{$b} <=> $convertedTags{$a} } keys %convertedTags) {
        printf STDERR "  %s %d\n", $key, $convertedTags{$key};
    }
}




sub countSupportedSpecialFunctions
{
    my $m = 0;

    #supported special functions:
    foreach $func (
        # Synchronization:
        "__syncthreads",
    )
    {
        # match math at the beginning of a word, but not if it already has a namespace qualifier ('::') :
        $m += m/[:]?[:]?\b($func)\b(\w*\()/g;

    }

    return $m;
}

sub warnUnsupportedSpecialFunctions
{
    my $line_num =  shift;

    my $m = 0;



    foreach $func (
        # memory fence:
        "__threadfence_block",
        "__threadfence",
        "__threadfence_system",

        # Synchronization:
        "__syncthreads_count",
        "__syncthreads_and",
        "__syncthreads_or",

        # Read-only cache function:
	# "__ldg",

        # Cross-lane and warp-vote instructions:
        #"__all",
        #"__any",
        #"__ballot",

        #"__popc",
        #"__clz",

        #"__shfl",
        #"__shfl_up",
        #"__shfl_down",
        #"__shfl_xor",

        "__prof_trigger",

        # too popular, and we can't tell if we are in device or host code.
        #"assert",
        #"printf",

        #"malloc",
        #"free",
        #"memset",
        #"memcpy"
    )
    {
        # match math at the beginning of a word, but not if it already has a namespace qualifier ('::') :
        my $mt = m/[:]?[:]?\b($func)\b(\w*\()/g;
        if  ($mt) {
            $m += $mt;
            print STDERR "  warning: $fileName:#$line_num : unsupported device function : $_\n";
        }

    }

    return $m;
}
